In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.cluster import KMeans
from bioinfokit.visuz import cluster
from sklearn.mixture import GaussianMixture
import statsmodels.api as sm
In [2]:
data = pd.read_csv("C:/Users/Nirmal/Documents/Python Scripts/mcdonalds.csv")
data
Out[2]:
yummy convenient spicy fattening greasy fast cheap tasty expensive healthy disgusting Like Age VisitFrequency Gender
0 No Yes No Yes No Yes Yes No Yes No No -3 61 Every three months Female
1 Yes Yes No Yes Yes Yes Yes Yes Yes No No 2 51 Every three months Female
2 No Yes Yes Yes Yes Yes No Yes Yes Yes No 1 62 Every three months Female
3 Yes Yes No Yes Yes Yes Yes Yes No No Yes 4 69 Once a week Female
4 No Yes No Yes Yes Yes Yes No No Yes No 2 49 Once a month Male
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1448 No Yes No Yes Yes No No No Yes No Yes -5 47 Once a year Male
1449 Yes Yes No Yes No No Yes Yes No Yes No 2 36 Once a week Female
1450 Yes Yes No Yes No Yes No Yes Yes No No 3 52 Once a month Female
1451 Yes Yes No No No Yes Yes Yes No Yes No 4 41 Every three months Male
1452 No Yes No Yes Yes No No No Yes No Yes -3 30 Every three months Male

1453 rows × 15 columns

In [3]:
data2 = data
In [4]:
data["Age"].value_counts()
Out[4]:
Age
55    53
60    38
37    37
59    36
57    36
52    36
58    35
36    35
49    34
62    34
50    34
32    33
44    32
56    32
64    32
53    31
26    31
24    30
35    30
51    30
47    30
42    30
23    30
39    29
29    28
34    28
30    28
38    27
40    27
31    27
25    26
33    26
61    26
67    26
48    26
43    25
27    25
63    25
54    24
41    23
22    23
65    23
45    22
20    21
46    19
28    18
66    17
21    16
18    16
70    15
69    14
68    13
19    10
71     1
Name: count, dtype: int64
In [5]:
data["VisitFrequency"].value_counts()
Out[5]:
VisitFrequency
Once a month             439
Every three months       342
Once a year              252
Once a week              235
Never                    131
More than once a week     54
Name: count, dtype: int64
In [6]:
data["Gender"].value_counts()
Out[6]:
Gender
Female    788
Male      665
Name: count, dtype: int64
In [7]:
labels = ['Male','Female']
sizes = [data.query('Gender == "Male"').Gender.count(),data.query('Gender == "Female"').Gender.count()]
In [8]:
colors = ['darkblue', 'cyan']
fig = go.Figure(data=[go.Pie(labels=labels, values=sizes)])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=colors))
fig.show()
In [9]:
fig2 = sns.catplot(x="VisitFrequency", y="Age", hue= "Gender", data=data, palette="CMRmap", kind="strip", height=8, aspect=2)
plt.xticks(rotation='vertical')
plt.title('Visit Frequency Based on Age', fontsize=20)
plt.xlabel('Frequency of Visit')
plt.show()
C:\Users\Nirmal\AppData\Roaming\Python\Python311\site-packages\seaborn\axisgrid.py:123: UserWarning:

The figure layout has changed to tight

In [10]:
fig3 = sns.countplot(x ='Like', data = data, palette = "Reds")
plt.title('Distribution of Customers by Preference Levels', fontsize=12)
plt.xlabel('Likeness')
plt.show()
C:\Users\Nirmal\AppData\Local\Temp\ipykernel_25864\202406410.py:1: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


In [11]:
data = data.drop(["Like", "Age", "VisitFrequency", "Gender"], axis=1)
data
Out[11]:
yummy convenient spicy fattening greasy fast cheap tasty expensive healthy disgusting
0 No Yes No Yes No Yes Yes No Yes No No
1 Yes Yes No Yes Yes Yes Yes Yes Yes No No
2 No Yes Yes Yes Yes Yes No Yes Yes Yes No
3 Yes Yes No Yes Yes Yes Yes Yes No No Yes
4 No Yes No Yes Yes Yes Yes No No Yes No
... ... ... ... ... ... ... ... ... ... ... ...
1448 No Yes No Yes Yes No No No Yes No Yes
1449 Yes Yes No Yes No No Yes Yes No Yes No
1450 Yes Yes No Yes No Yes No Yes Yes No No
1451 Yes Yes No No No Yes Yes Yes No Yes No
1452 No Yes No Yes Yes No No No Yes No Yes

1453 rows × 11 columns

In [12]:
data = data.apply(LabelEncoder().fit_transform)
data.head(6)
Out[12]:
yummy convenient spicy fattening greasy fast cheap tasty expensive healthy disgusting
0 0 1 0 1 0 1 1 0 1 0 0
1 1 1 0 1 1 1 1 1 1 0 0
2 0 1 1 1 1 1 0 1 1 1 0
3 1 1 0 1 1 1 1 1 0 0 1
4 0 1 0 1 1 1 1 0 0 1 0
5 1 1 0 1 0 1 1 1 0 0 0
In [13]:
result = data.mean().round(2)
print(result)
yummy         0.55
convenient    0.91
spicy         0.09
fattening     0.87
greasy        0.53
fast          0.90
cheap         0.60
tasty         0.64
expensive     0.36
healthy       0.20
disgusting    0.24
dtype: float64
In [14]:
pca = PCA()
MD_pca = pca.fit_transform(data)

n_components = min(data.shape)  

std_dev = np.sqrt(pca.explained_variance_)
prop_variance = pca.explained_variance_ratio_
cum_prop = np.cumsum(prop_variance)

results = pd.DataFrame({
    'Standard deviation': std_dev,
    'Proportion of Variance': prop_variance,
    'Cumulative Proportion': cum_prop
}, index=[f'PC{i+1}' for i in range(n_components)])

print(results.round(4))
      Standard deviation  Proportion of Variance  Cumulative Proportion
PC1               0.7570                  0.2994                 0.2994
PC2               0.6075                  0.1928                 0.4922
PC3               0.5046                  0.1330                 0.6253
PC4               0.3988                  0.0831                 0.7084
PC5               0.3374                  0.0595                 0.7679
PC6               0.3103                  0.0503                 0.8182
PC7               0.2897                  0.0438                 0.8620
PC8               0.2751                  0.0395                 0.9016
PC9               0.2653                  0.0368                 0.9383
PC10              0.2488                  0.0324                 0.9707
PC11              0.2369                  0.0293                 1.0000
In [15]:
loadings = pca.components_
num_pc = pca.n_features_in_
pc_list = ["PC"+str(i) for i in list(range(1, num_pc+1))]
loadings_data = pd.DataFrame.from_dict(dict(zip(pc_list, loadings)))
loadings_data['feature'] = data.columns.values
loadings_data = loadings_data.set_index('feature')
print(loadings_data.round(2))
             PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8   PC9  PC10  PC11
feature                                                                     
yummy      -0.48  0.36 -0.30  0.06 -0.31  0.17 -0.28  0.01  0.57 -0.11  0.05
convenient -0.16  0.02 -0.06 -0.14  0.28 -0.35 -0.06 -0.11 -0.02 -0.67 -0.54
spicy      -0.01  0.02 -0.04  0.20  0.07 -0.36  0.71  0.38  0.40 -0.08  0.14
fattening   0.12 -0.03 -0.32 -0.35 -0.07 -0.41 -0.39  0.59 -0.16 -0.01  0.25
greasy      0.30 -0.06 -0.80  0.25  0.36  0.21  0.04 -0.14 -0.00  0.01  0.00
fast       -0.11 -0.09 -0.06 -0.10  0.11 -0.59 -0.09 -0.63  0.17  0.24  0.34
cheap      -0.34 -0.61 -0.15  0.12 -0.13 -0.10 -0.04  0.14  0.08  0.43 -0.49
tasty      -0.47  0.31 -0.29 -0.00 -0.21 -0.08  0.36 -0.07 -0.64  0.08  0.02
expensive   0.33  0.60  0.02  0.07 -0.00 -0.26 -0.07  0.03  0.07  0.45 -0.49
healthy    -0.21  0.08  0.19  0.76  0.29 -0.18 -0.35  0.18 -0.19 -0.04  0.16
disgusting  0.37 -0.14 -0.09  0.37 -0.73 -0.21 -0.03 -0.17 -0.07 -0.29 -0.04
In [16]:
pca_scores = PCA().fit_transform(data)

cluster.biplot(cscore=pca_scores, loadings=loadings, labels=data.columns.values, var1=round(pca.explained_variance_ratio_[0]*100, 2),
    var2=round(pca.explained_variance_ratio_[1]*100, 2),show=True,dim=(10,5))
In [17]:
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(data)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
In [18]:
kmeans = KMeans(n_clusters=4, random_state=0)
data['Cluster'] = kmeans.fit_predict(MD_pca)
data['PCA1'] = MD_pca[:, 0]
data['PCA2'] = MD_pca[:, 1]
centroids = data.groupby('Cluster').mean()[['PCA1', 'PCA2']].values
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=data, palette='viridis', s=50, edgecolor='k')
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c='red', label='Centroid', marker='X')

plt.legend()
plt.title('Customer Clusters')
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.show()
C:\Users\Nirmal\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

In [19]:
data2["Like"].value_counts()
Out[19]:
Like
 3    229
 2    187
 0    169
 4    160
 1    152
-5    152
 5    143
-3     73
-4     71
-2     59
-1     58
Name: count, dtype: int64
In [20]:
categorical_columns = data2[['yummy', 'convenient', 'spicy', 'fattening', 'greasy', 'fast',
                                                  'cheap', 'tasty', 'expensive', 'healthy', 'disgusting']]

label_encoder = LabelEncoder()

for col in categorical_columns:
    data2[col] = label_encoder.fit_transform(data2[col])

print(data2.head(6))
   yummy  convenient  spicy  fattening  greasy  fast  cheap  tasty  expensive  \
0      0           1      0          1       0     1      1      0          1   
1      1           1      0          1       1     1      1      1          1   
2      0           1      1          1       1     1      0      1          1   
3      1           1      0          1       1     1      1      1          0   
4      0           1      0          1       1     1      1      0          0   
5      1           1      0          1       0     1      1      1          0   

   healthy  disgusting  Like  Age      VisitFrequency  Gender  
0        0           0    -3   61  Every three months  Female  
1        0           0     2   51  Every three months  Female  
2        1           0     1   62  Every three months  Female  
3        0           1     4   69         Once a week  Female  
4        1           0     2   49        Once a month    Male  
5        0           0     2   55  Every three months    Male  
In [21]:
data2 = data2.drop(["Age", "VisitFrequency", "Gender"], axis = 1)
data2
Out[21]:
yummy convenient spicy fattening greasy fast cheap tasty expensive healthy disgusting Like
0 0 1 0 1 0 1 1 0 1 0 0 -3
1 1 1 0 1 1 1 1 1 1 0 0 2
2 0 1 1 1 1 1 0 1 1 1 0 1
3 1 1 0 1 1 1 1 1 0 0 1 4
4 0 1 0 1 1 1 1 0 0 1 0 2
... ... ... ... ... ... ... ... ... ... ... ... ...
1448 0 1 0 1 1 0 0 0 1 0 1 -5
1449 1 1 0 1 0 0 1 1 0 1 0 2
1450 1 1 0 1 0 1 0 1 1 0 0 3
1451 1 1 0 0 0 1 1 1 0 1 0 4
1452 0 1 0 1 1 0 0 0 1 0 1 -3

1453 rows × 12 columns

In [22]:
perception_vars = data2.columns[0:11] 
formula = 'Like ~ ' + ' + '.join(perception_vars)
print(formula)
Like ~ yummy + convenient + spicy + fattening + greasy + fast + cheap + tasty + expensive + healthy + disgusting
In [23]:
X = data2[perception_vars]
y = data2['Like']
In [24]:
reg_mix = GaussianMixture(n_components=2, n_init = 10, random_state=1234)
cluster_labels = reg_mix.fit_predict(np.column_stack((X, y.values.reshape(-1, 1))))
print("Cluster sizes:")
print(np.bincount(cluster_labels))
print(f"Convergence after {reg_mix.n_iter_} iterations")
Cluster sizes:
[ 393 1060]
Convergence after 8 iterations
In [25]:
for cluster in [0, 1]:
    print(f"\nCluster {cluster + 1}")
    X_cluster = sm.add_constant(X[cluster_labels == cluster])
    y_cluster = y[cluster_labels == cluster]
    model = sm.OLS(y_cluster, X_cluster).fit()
    print(model.summary())
Cluster 1
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   Like   R-squared:                       0.503
Model:                            OLS   Adj. R-squared:                  0.489
Method:                 Least Squares   F-statistic:                     35.06
Date:                Mon, 26 Aug 2024   Prob (F-statistic):           2.45e-51
Time:                        20:11:15   Log-Likelihood:                -853.81
No. Observations:                 393   AIC:                             1732.
Df Residuals:                     381   BIC:                             1779.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.6131      0.516     -5.064      0.000      -3.628      -1.599
yummy          3.2997      0.375      8.807      0.000       2.563       4.036
convenient     0.9480      0.285      3.325      0.001       0.387       1.509
spicy         -0.0740      0.384     -0.193      0.847      -0.829       0.680
fattening      0.0518      0.475      0.109      0.913      -0.882       0.986
greasy         0.0906      0.289      0.313      0.754      -0.479       0.660
fast           0.4125      0.315      1.309      0.191      -0.207       1.032
cheap         -0.0054      0.315     -0.017      0.986      -0.625       0.614
tasty          1.3688      0.335      4.089      0.000       0.711       2.027
expensive     -0.0361      0.309     -0.117      0.907      -0.644       0.571
healthy        0.3812      0.447      0.852      0.395      -0.498       1.261
disgusting    -1.9545      0.466     -4.195      0.000      -2.871      -1.038
==============================================================================
Omnibus:                        3.824   Durbin-Watson:                   1.982
Prob(Omnibus):                  0.148   Jarque-Bera (JB):                3.569
Skew:                           0.220   Prob(JB):                        0.168
Kurtosis:                       3.157   Cond. No.                         13.1
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Cluster 2
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   Like   R-squared:                       0.414
Model:                            OLS   Adj. R-squared:                  0.409
Method:                 Least Squares   F-statistic:                     82.31
Date:                Mon, 26 Aug 2024   Prob (F-statistic):          2.49e-115
Time:                        20:11:15   Log-Likelihood:                -2127.0
No. Observations:                1060   AIC:                             4274.
Df Residuals:                    1050   BIC:                             4324.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
yummy          2.0706      0.150     13.770      0.000       1.776       2.366
convenient    -0.3373      0.302     -1.116      0.265      -0.930       0.256
spicy         -0.4944      0.197     -2.505      0.012      -0.882      -0.107
fattening     -0.5010      0.172     -2.912      0.004      -0.839      -0.163
greasy        -0.3650      0.118     -3.089      0.002      -0.597      -0.133
fast           0.2400      0.222      1.082      0.279      -0.195       0.675
cheap          0.0656      0.174      0.378      0.705      -0.275       0.406
tasty          1.3624      0.166      8.202      0.000       1.036       1.688
expensive      0.0112      0.181      0.062      0.951      -0.344       0.366
healthy        0.4613      0.143      3.230      0.001       0.181       0.742
disgusting          0          0        nan        nan           0           0
==============================================================================
Omnibus:                       70.223   Durbin-Watson:                   2.009
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               84.166
Skew:                          -0.634   Prob(JB):                     5.29e-19
Kurtosis:                       3.548   Cond. No.                          inf
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is      0. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
C:\ProgramData\anaconda3\Lib\site-packages\statsmodels\regression\linear_model.py:1965: RuntimeWarning:

divide by zero encountered in scalar divide

In [26]:
comp1_coef = [3.2997, 0.9480, -0.0740,  0.0518,  0.0906, 0.4125,  -0.0054, 1.3688, -0.0361, 0.3812, -1.9545]
comp1_se = [ 0.375,  0.285, 0.384, 0.475,  0.289, 0.315, 0.315, 0.335, 0.309, 0.447, 0.466 ]

comp2_coef = [2.0706, -0.3373, -0.4944, -0.5010, -0.3650, 0.2400, 0.0656, 1.3624,  0.0112, 0.4613, 0]
comp2_se = [ 0.150, 0.302, 0.197, 0.172, 0.118, 0.222, 0.174, 0.166, 0.181, 0.143, 0]
In [27]:
variables = ['yummy', 'convenient', 'spicy', 'fattening', 'greasy', 
             'fast', 'cheap', 'tasty', 'expensive', 'healthy', 'disgusting']
In [30]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8), sharey=True)
fig.suptitle('Coefficient Estimates for Segments 1 and 2')

def plot_coef(ax, coef, se, title):
    y_pos = np.arange(len(variables))
    ax.barh(y_pos, coef, xerr=se, align='center', capsize=5, color='skyblue', edgecolor='black')
    ax.axvline(x=0, color='k', linestyle='--')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(variables)
    ax.set_title(title)
    ax.set_xlim(-6, 6)

plot_coef(ax1, comp1_coef, comp1_se, 'Segment 1')
plot_coef(ax2, comp2_coef, comp2_se, 'Segment 2')

ax1.invert_yaxis()  
plt.tight_layout()
plt.show()
In [ ]: